── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
2.2 Loading in Unfiltered Dataframe
PT106_merged_df <-readRDS("./PT106_merged_df.rds")PT108_merged_df <-readRDS("./PT108_merged_df.rds")PT111_merged_df <-readRDS("./PT111_merged_df.rds")PT113_merged_df <-readRDS("./PT113_merged_df.rds")PT114_merged_df <-readRDS( "./PT114_merged_df.rds")PT116_merged_df <-readRDS("./PT116_merged_df.rds")PT117_merged_df <-readRDS("./PT117_merged_df.rds")PT118_merged_df <-readRDS("./PT118_merged_df.rds")PT119_merged_df <-readRDS("./PT119_merged_df.rds")PT122_merged_df <-readRDS("./PT122_merged_df.rds")patient_dfs_postnivo <-list(PT108 = PT108_merged_df,PT113 = PT113_merged_df,PT114 = PT114_merged_df, PT116 = PT116_merged_df,PT117 = PT117_merged_df, PT118 = PT118_merged_df, PT119 = PT119_merged_df,PT122 = PT122_merged_df)# Find common columns across all dataframes (trying to merge across dataframes)common_cols_postnivo <-Reduce(intersect, lapply(patient_dfs_postnivo, colnames))# Subset all dataframes to only those columnspatient_dfs_postnivo <-lapply(patient_dfs_postnivo, function(df) df[,common_cols_postnivo])# Combine all dataframes into one list, adding patient IDcombined_df_postnivo <-bind_rows(lapply(names(patient_dfs_postnivo), function(pid) { df <- patient_dfs_postnivo[[pid]] df$patient_id <- pidreturn(df) }))df_matrix_postnivo <- combined_df_postnivo %>%mutate(clone_label =paste(clonotype_id, patient_id, sep ="_")) %>% dplyr::select(clone_label, Prevaccine_umi_fraction, Postvaccine_umi_fraction, Postnivo_umi_fraction)umi_matrix_postnivo <-as.matrix(df_matrix_postnivo[, -1])rownames(umi_matrix_postnivo) <- df_matrix_postnivo$clone_label
# Analogous to Elbow Method for finding the optimal number of clustersset.seed(4)# plot k = 2 to k = 15.k.max <-15data <- umi_matrix_postnivowss <-sapply(1:k.max, function(k){kmeans(data, k, iter.max =50)$tot.withinss})plot(1:k.max, wss,type="b", pch =19, frame =FALSE, xlab="Number of clusters K",ylab="Total within-clusters sum of squares")
2.5 Plotting the centroids and the SD from the centroids
# Get cluster assignment for each rowcluster_assignments <- km_postnivo$clusterscaled_mat <-as.data.frame(scaled_umi_matrix_postnivo)scaled_mat$cluster <-factor(cluster_assignments)scaled_mat$id <-rownames(scaled_mat)# Reshape to long formatlong_scaled <- scaled_mat %>%pivot_longer(cols =c(Prevaccine_umi_fraction, Postvaccine_umi_fraction, Postnivo_umi_fraction),names_to ="timepoint", values_to ="value")centroid_long <- km_postnivo$centers %>%as.data.frame() %>%mutate(cluster =factor(rownames(.))) %>%pivot_longer(cols =-cluster, # pivot all columns except 'cluster'names_to ="timepoint",values_to ="centroid_value" )# Join data to get centroid per cluster + timepointjoined <- long_scaled %>%left_join(centroid_long, by =c("cluster", "timepoint")) %>%mutate(sq_diff = (value - centroid_value)^2)# Calculate SD around the centroid (not the mean)summary_stats <- joined %>%group_by(cluster, timepoint, centroid_value) %>%summarise(sd =sqrt(mean(sq_diff)),lower = centroid_value - sd,upper = centroid_value + sd,.groups ="drop" )
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
always returns an ungrouped data frame and adjust accordingly.